library('data.table')
library('zoo')

d <- fread('compustat_data.csv') # the source file downloaded from WRDS

d <- d[fic == 'USA']
d <- d[sic > 4999 | sic < 4900]
d <- d[sic < 6900 | sic > 6999]

d <- d[sale > 0 & emp > 0 & at > 0 & ppegt > 0 & xopr > 0]

####### ####### ####### ####### ####### 
####### Investment accumulation
####### ####### ####### ####### ####### 

count <- d[,.(nobs1 = sum(!is.na(ppegt)), nobs2 = sum(!is.na(ppent))), by = 'gvkey']
d <- merge(d, count, by = 'gvkey')
d <- d[nobs1 >= 1 & nobs2 > 1]

dates <- unique(d[,'fyear'])
setkey(dates, fyear)
N <- dim(dates)[1]
dates$period <- 1:N
d <- merge(d, dates, by = 'fyear')

d$date <- d$fyear

# investment deflator
invdef <- fread('A008RD3Q086SBEA.csv')
invdef$date <- as.numeric(substr(invdef$DATE, 1, 4))
invdef$DATE <- NULL
names(invdef)[1] <- 'INVDEF'
invdef <- invdef[date >= 1963 & date <= 2018]
invdef$INVDEF <- invdef$INVDEF/100

# iterate on capital accumulation equation
source('per_inv.R')
source('impute.R')

dt <- d[,.(period = 1:N, cum_inv = per_inv(ppegt, ppent, invdef$INVDEF, period,N)), by = 'gvkey']
dt <- dt[complete.cases(dt)]
d <- merge(d, dt, by = c('period','gvkey'))

####### ####### ####### ####### ####### 
####### Get FF-49 industry classification
####### ####### ####### ####### ####### 

ff49 <- fread('sic_2_ff49.csv')
d <- merge(d, ff49, by = 'sic')


####### ####### ####### ####### ####### 
####### Get GDP Deflator
####### ####### ####### ####### ####### 


#### Fred data ####
gdpdef <- fread('GDPDEF.csv')
gdpdef$date <- as.numeric(substr(gdpdef$DATE, 1, 4))
gdpdef$DATE <- NULL

d <- merge(d, gdpdef, by = 'date')

write.csv(d, 'annual_data_markups.csv')
